import pandas as pd
import numpy as np
from pandas import read_csv
from pandas import datetime
from matplotlib import pyplot
import holidays
import seaborn as sns
import pandas_profiling
import plotly.offline as py
import plotly.graph_objs as go
import prophet_prediction as prp
# Enable inline plotly rendering inside the notebook.
py.init_notebook_mode()
# Widen pandas display limits so long/wide frames print fully in notebook cells.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# NOTE(review): `encoding` is defined but never passed to read_csv below — confirm intent.
encoding = "utf-8-sig"
# Reading Data
# Load the CalIt2 building people-count data: one row per (flow_id, date, time) bin.
# FIX: `squeeze=True` was deprecated in pandas 1.4 and removed in 2.0, and was a
# no-op here anyway (a frame with four named columns can never squeeze to a Series).
df_orig = read_csv('CalIt2.data', header=None, names=["flow_id", "date", "time", "count"])
df = df_orig.copy()
df_orig.head(2)
# Combine the separate date and time strings into a single datetime column.
date_time = pd.to_datetime(df['date'] + ' ' + df['time'])
# Drop the raw date/time columns, then append the merged datetime.
df.drop(['date', 'time'], axis=1, inplace=True)
df = pd.concat([df, date_time], axis=1)
df.columns = ['flow_id', 'count', 'datetime']
df.head()
# Quick exploratory profile of the merged frame.
pandas_profiling.ProfileReport(df)
- People counter data covering 24/07/2005 - 05/11/2005 (about 4 months), so there is no yearly-seasonality information in the data.
- The 'count' values are 0 roughly 50% of the time.
- Max value of 'count' is 62; most values lie between 0 and 10. See the profiler output above: Variables -> Count -> Histogram
- No missing data
'df_in' --> incoming people at the main door
'df_out' --> outgoing people at the main door
'df_dif' --> incoming minus outgoing people at the main door
'df_in_out' --> incoming and outgoing people as separate columns
# Split the stream by flow direction: flow_id 9 = incoming, flow_id 7 = outgoing.
# FIX: the original mutated the `.loc` slices in place (drop/set_index with
# inplace=True), which operates on a potential view of `df` and triggers
# SettingWithCopyWarning with a risk of silent no-op mutation. Take explicit
# copies before mutating.
df_in = df.loc[df['flow_id'] == 9].copy()
df_out = df.loc[df['flow_id'] == 7].copy()
df_in.drop('flow_id', inplace=True, axis=1)
df_in.set_index('datetime', inplace=True)
df_out.drop('flow_id', inplace=True, axis=1)
df_out.set_index('datetime', inplace=True)
# Net flow (in - out) and a two-column in/out frame, aligned on the datetime index.
df_dif = pd.concat([df_in['count'] - df_out['count']], axis=1)
df_in_out = pd.concat([df_in['count'], df_out['count']], axis=1)
df_in_out.columns = ["count_in", "count_out"]
print('df_in')
print(df_in.head(2))
print("--------")
print('df_out')
print(df_out.head(2))
print("--------")
print('df_dif')
print(df_dif.head(2))
print("--------")
print('df_in_out')
print(df_in_out.head(2))
pandas_profiling.ProfileReport(df_in_out)
- 'df_in'
- 'df_out'
- 'df_dif'
# --- Prophet forecast: incoming flow ---
# prp.prophet_predict presumably returns (forecast frame, fitted model) — see prophet_prediction module.
forecast_in, m_in = prp.prophet_predict(df_in)
mae_in = prp.forecast_mae(forecast_in)
print(mae_in)
# Plot Trend
fig = m_in.plot_components(forecast_in) # Plot
# Plot Forecast
prp.plot_forecast_ts(forecast_in) # Plot
# --- Prophet forecast: outgoing flow ---
forecast_out, m_out = prp.prophet_predict(df_out)
mae_out = prp.forecast_mae(forecast_out)
print(mae_out)
# Plot Trend
fig = m_out.plot_components(forecast_out) # Plot
# Plot Forecast
prp.plot_forecast_ts(forecast_out) # Plot
# --- Prophet forecast: net flow (in - out) ---
forecast_dif, m_dif = prp.prophet_predict(df_dif)
mae_dif = prp.forecast_mae(forecast_dif)
print(mae_dif)
# Plot Trend
fig = m_dif.plot_components(forecast_dif) # Plot
# Plot Forecast
prp.plot_forecast_ts(forecast_dif) # Plot
# Absolute Pearson correlation between incoming and outgoing counts,
# rendered as a small heatmap.
corrmat_in_out = df_in_out.corr().abs()
f, axes = pyplot.subplots(1, 1, figsize=(2, 2))
sns.heatmap(corrmat_in_out, vmax=1, vmin=0, square=True, ax=axes)
# Display the matrix itself. FIX: the original repeated this bare expression
# twice; only the last expression in a notebook cell renders, so the duplicate
# was a no-op and has been removed.
corrmat_in_out
# Work on copies so the engineered feature columns below don't modify df_in/df_out.
df_in_regression = df_in.copy()
df_out_regression = df_out.copy()
from datetime import datetime
def make_categorical_hour(hour):
    """Map an hour of day to a categorical bucket label ("1".."6").

    Buckets: 0-3 and 22-23 -> "1" (night), 4-7 -> "2", 8-11 -> "3",
    12-15 -> "4", 16-19 -> "5", 20-21 -> "6".

    FIX: the original ended with `elif hour < 25`, which silently returned
    None for any value >= 25; a final `else` closes that gap while keeping
    identical results for real hours (0-23).
    """
    if hour < 4:
        return "1"
    elif hour < 8:
        return "2"
    elif hour < 12:
        return "3"
    elif hour < 16:
        return "4"
    elif hour < 20:
        return "5"
    elif hour < 22:
        return "6"
    else:
        return "1"
def add_date_feature(df, df_date):
    """Add day-of-week, categorical-hour, and is-holiday feature columns to df.

    df      : frame to mutate in place (new columns added).
    df_date : datetime Series aligned with df's rows.
    """
    df['dayofweek'] = df_date.dt.dayofweek.astype(int)
    df['hour'] = df_date.dt.hour.astype(int).map(make_categorical_hour)
    # California (US) holiday calendar.
    us_holidays = holidays.UnitedStates(prov=None, state='CA')
    # FIX: the original iterated `df_date[date_time.index]`, reaching out to the
    # module-level global `date_time` instead of the parameter. That only worked
    # because the caller passed date_time itself (self-indexed). Iterate the
    # parameter directly to remove the hidden coupling.
    df['isholiday'] = [x in us_holidays for x in df_date]
# Rebuild the datetime series for the incoming-flow rows (flow_id == 9) from the
# raw frame; both regression frames cover the same timestamps.
temp = df_orig.loc[df_orig['flow_id'] == 9]
date_time = pd.to_datetime(temp['date'] + ' ' + temp['time'])
# Index the series by itself so label lookups resolve to the same rows.
date_time.index = date_time
add_date_feature(df_in_regression, date_time)
add_date_feature(df_out_regression, date_time)
# Spot-check the engineered features.
df_in_regression.sample(2)
df_out_regression.sample(2)
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score, explained_variance_score, \
mean_squared_error, mean_absolute_error, \
mean_squared_log_error, median_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Perceptron
from statsmodels.tsa.arima_model import ARIMA
#import pdb
def forecast_cross_validation(df_data, y_name='count'):
    """Compare a suite of regression models with 7-fold cross-validation.

    Parameters
    ----------
    df_data : DataFrame containing the target column plus feature columns.
    y_name  : name of the target column (default 'count').

    Returns
    -------
    DataFrame of fold-averaged metrics (r2, rmse, mse, mae, exp_var,
    medae_cv), one row per model, sorted ascending by r2.
    """
    y = df_data[y_name]
    X = df_data.drop(y_name, axis=1)
    kf = KFold(n_splits=7, shuffle=True)  # randomized fold assignment
    Xv = X.values
    yv = y.values
    preds = []   # per-fold prediction frames (kept for inspection/debugging)
    scores = []  # per-fold score frames
    for train_index, test_index in kf.split(X):
        X_train, X_test = Xv[train_index], Xv[test_index]
        y_train, y_test = yv[train_index], yv[test_index]
        # Fresh model instances every fold so no fitted state leaks across folds.
        xgb_select_pipeline = Pipeline([
            ('feature_selection', SelectFromModel(XGBRegressor(), threshold='0.2*mean')),
            ('regression', XGBRegressor())
        ])
        poly_pipeline = Pipeline([
            ('poly', PolynomialFeatures(degree=2)),
            ('feature_selection', SelectFromModel(LinearRegression(fit_intercept=False))),
            ('linear', LinearRegression(fit_intercept=False))
        ])
        # FIX: the original keyed columns/rows by type(model).__name__, so the
        # two Pipeline models both mapped to "Pipeline" and the second silently
        # overwrote the first's predictions and scores. Use explicit unique names.
        named_models = [
            ('XGBSelectPipeline', xgb_select_pipeline),
            ('LinearRegression', linear_model.LinearRegression()),
            ('Ridge', linear_model.Ridge(alpha=0.5)),
            ('Lasso', linear_model.Lasso(alpha=0.1)),
            ('ElasticNet', linear_model.ElasticNet()),
            ('OrthogonalMatchingPursuit', linear_model.OrthogonalMatchingPursuit()),
            ('SGDRegressor', linear_model.SGDRegressor()),
            ('PolyLinearPipeline', poly_pipeline),
            ('LGBMRegressor', LGBMRegressor()),
            ('RandomForestRegressor', RandomForestRegressor(n_estimators=30)),
            ('XGBRegressor', XGBRegressor()),
        ]
        pred_df = pd.DataFrame()
        score_df = pd.DataFrame()
        for name, model in named_models:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            pred_df[name] = y_pred
            score_df.at[name, "r2"] = r2_score(y_test, y_pred)
            score_df.at[name, "rmse"] = np.sqrt(mean_squared_error(y_test, y_pred))
            score_df.at[name, "mse"] = mean_squared_error(y_test, y_pred)
            score_df.at[name, "mae"] = mean_absolute_error(y_test, y_pred)
            score_df.at[name, "exp_var"] = explained_variance_score(y_test, y_pred)
            score_df.at[name, "medae_cv"] = median_absolute_error(y_test, y_pred)
        scores.append(score_df)
        preds.append(pred_df)
    n_splits = kf.get_n_splits(X)
    # Average the per-fold score frames.
    scores_average = scores[0] / n_splits
    for i in range(1, n_splits):
        scores_average = scores_average.add(scores[i] / n_splits, fill_value=0)
    # FIX: the original called applymap('{0:.3f}'.format) here and discarded
    # the result — a no-op — so that line has been removed.
    return scores_average.sort_values(by=['r2'])
# Run the model comparison on each engineered feature frame.
forecast_cross_validation(df_in_regression)
forecast_cross_validation(df_out_regression)
df_in.head(2)
df_out.head(2)
# Net flow (incoming - outgoing) as a single-column frame for the ARIMA runs.
df_diff = pd.concat([df_in_out['count_in'] - df_in_out['count_out'] ], axis=1)
df_diff.columns = ["count"]
df_diff.head(3)
from statsmodels.tsa.arima_model import ARIMA
from tqdm import tqdm
def arima_forecast(df):
    """Walk-forward ARIMA(5,1,0) evaluation on a 66/34 split, then plot.

    Refits the model on the growing history after each observed test point,
    prints the test MSE, and plots observed (blue) vs predicted (yellow).
    """
    series = df.astype('float64').values
    split_at = int(len(series) * 0.66)
    train, test = series[:split_at], series[split_at:]
    history = list(train)  # growing window of everything seen so far
    predictions = []
    for observed in tqdm(test):
        # Refit on all history and forecast exactly one step ahead.
        fitted = ARIMA(history, order=(5, 1, 0)).fit(disp=0)
        predictions.append(fitted.forecast()[0])
        history.append(observed)
    print('Test MSE: %.3f' % mean_squared_error(test, predictions))
    pyplot.plot(test, color='blue')
    pyplot.plot(predictions, color='yellow')
    pyplot.show()
# Walk-forward ARIMA evaluation for each series (incoming, outgoing, net).
arima_forecast(df_in)
arima_forecast(df_out)
arima_forecast(df_diff)
In regression:
- Add lagged features from previous days (shifting)
- Add Prophet trend information
- Subtract outgoing from incoming people counts for predictions and inspect the error
- Determine anomalies from the prediction error: if the error is high there may be an anomaly
In ARIMA, anomalies can be seen in the plot
- Research...
- Integrate with the 'events' data: build a pipeline to find event start and end hours.
- Combine all of the information
#from keras.models import Sequential
#from keras.layers import LSTM
#from keras.layers import Dense
## create and fit the LSTM network
#lstm = Sequential()
#lstm.add(LSTM(30))
#lstm.add(Dense(2))
#lstm.compile(loss='mean_squared_error', optimizer='adam')